The dataset can be downloaded from Video Game Sales.
This dataset contains a list of video games with sales greater than 100,000 copies. It was generated by a scrape of vgchartz.com.
The dataset contains 11 columns. The names and data types are as follows:
# Load important packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, cross_val_predict
# Read the raw sales table, then peek at its first rows and its dimensions.
df = pd.read_csv("dataset/vgsales.csv", encoding="ISO-8859-1", low_memory=False)
df.head()
df.shape

# Visualise where values are missing, then print the null / non-null tally
# for each column (True = missing, False = present).
msno.matrix(df)
missing_data = df.isna()
for column in missing_data:
    print(column)
    print(missing_data[column].value_counts())
    print("")

# Remove every row that contains at least one missing value (in place), and
# re-check the null counts, the new shape and the column dtypes.
df.dropna(inplace=True)
df.isna().sum()
df.shape
df.dtypes
# Convert the "Year" column to datetime.
# The values are cast to int and then to str first: pd.to_datetime with an
# explicit format stringifies non-string values, so a float year such as
# 2006.0 would become "2006.0" and fail to match "%Y".
# NOTE(review): assumes no missing years remain at this point (rows with
# nulls are dropped earlier in the notebook) -- astype(int) would raise on NaN.
df['Year'] = pd.to_datetime(df['Year'].astype(int).astype(str), format='%Y')
df['Year'].dtypes
df.head()

# Keep only the calendar year, as an integer column named "year", and drop
# the intermediate datetime column.
df['year'] = df['Year'].dt.year
df = df.drop(columns=['Year'])
df.head()
# Count the distinct values in each of the categorical columns.
categorical_columns = ['Genre', 'Publisher', 'Platform', 'Name']
df_num_unique = df[categorical_columns]
print(df_num_unique.nunique())

# Descriptive statistics for the regional and global sales columns.
sales_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
df_sales = df[sales_columns]
df_sales.describe()
# Build a table with one row per genre: number of titles, share of all
# titles, and the genre label.
genre_count = pd.DataFrame(df['Genre'].value_counts())
genre_percent = pd.DataFrame(df['Genre'].value_counts(normalize=True))
genre_df = genre_count.merge(genre_percent, left_index=True, right_index=True)
genre_df['Genres'] = genre_df.index
# Rename positionally: the merged column labels differ between pandas
# versions, but the order (count, share, label) does not.
genre_df.columns = ['Count', 'Percent', 'Genres']
# Discard the genre-labelled index in favour of 0..n-1. reset_index(drop=True)
# is used instead of reset_index().drop('index', axis=1) because the column
# produced by reset_index() is only called 'index' when the index is unnamed.
genre_df = genre_df.reset_index(drop=True)
genre_df.head(10)

# Horizontal bar chart of the number of titles per genre.
df["Genre"].value_counts().plot.barh()
# Build a table with one row per publisher: number of titles, share of all
# titles, and the publisher name.
publisher_count = pd.DataFrame(df['Publisher'].value_counts())
publisher_percent = pd.DataFrame(df['Publisher'].value_counts(normalize=True))
publisher_df = publisher_count.merge(publisher_percent, left_index=True, right_index=True)
publisher_df['Publisher'] = publisher_df.index
# Rename positionally: the merged column labels differ between pandas
# versions, but the order (count, share, label) does not.
publisher_df.columns = ['Count', 'Percent', 'Publisher']
# Discard the publisher-labelled index in favour of 0..n-1.
# reset_index(drop=True) is used instead of reset_index().drop('index', axis=1)
# because the column produced by reset_index() is only called 'index' when the
# index is unnamed.
publisher_df = publisher_df.reset_index(drop=True)
publisher_df.head(10)
# Yearly Electronic Arts sales per genre, one line plot per sales region.
# The publisher filter is identical for every region, so it is evaluated once.
ea_sales = df[df["Publisher"] == "Electronic Arts"]

# The original per-region variable names are kept as aliases of the same
# filtered frame so any later cell that refers to them keeps working.
ea_na = ea_eu = ea_jp = ea_rest = ea_global = ea_sales

# (sales column, figure size) in the order the plots are drawn.
region_plots = [
    ("NA_Sales", (20, 6)),
    ("EU_Sales", (20, 6)),
    ("JP_Sales", (10, 6)),
    ("Other_Sales", (20, 6)),
    ("Global_Sales", (20, 6)),
]
for sales_column, figure_size in region_plots:
    plt.figure(figsize=figure_size)
    sns.lineplot(data=ea_sales, x="year", y=sales_column, hue="Genre")
# Build a table with one row per platform: number of titles, share of all
# titles, and the platform name.
platform_count = pd.DataFrame(df['Platform'].value_counts())
platform_percent = pd.DataFrame(df['Platform'].value_counts(normalize=True))
platform_df = platform_count.merge(platform_percent, left_index=True, right_index=True)
platform_df['Platform'] = platform_df.index
# Rename positionally: the merged column labels differ between pandas
# versions, but the order (count, share, label) does not.
platform_df.columns = ['Count', 'Percent', 'Platform']
# Discard the platform-labelled index in favour of 0..n-1.
# reset_index(drop=True) is used instead of reset_index().drop('index', axis=1)
# because the column produced by reset_index() is only called 'index' when the
# index is unnamed.
platform_df = platform_df.reset_index(drop=True)
platform_df.head(10)
# Stacked histogram: for Electronic Arts, how many titles of each genre were
# released on each platform.

# All Electronic Arts rows; this frame is reused later in the file.
df_EA = df[df["Publisher"] == "Electronic Arts"]

# One entry per histogram trace, in stacking order:
# (value in the "Genre" column, legend label, bar colour).
# The legend label for "Role-Playing" intentionally has no hyphen.
genre_styles = [
    ("Action", "Action", 'rgb(165,0,38)'),
    ("Sports", "Sports", 'rgb(215,48,39)'),
    ("Misc", "Misc", 'rgb(244,109,67)'),
    ("Role-Playing", "Role Playing", 'rgb(253,174,97)'),
    ("Shooter", "Shooter", 'rgb(254,224,144)'),
    ("Adventure", "Adventure", 'rgb(170,253,87)'),
    ("Racing", "Racing", 'rgb(171,217,233)'),
    ("Platform", "Platform", 'rgb(116,173,209)'),
    ("Simulation", "Simulation", 'rgb(69,117,180)'),
    ("Fighting", "Fighting", 'rgb(49,54,149)'),
    ("Strategy", "Strategy", "rgb(10,77,131)"),
    ("Puzzle", "Puzzle", 'rgb(1,15,139)'),
]

# Each trace receives the "Platform" values of the rows belonging to one
# genre; plotly counts the occurrences of each platform.
trace = [
    go.Histogram(
        x=df_EA.loc[df_EA["Genre"] == genre, "Platform"],
        opacity=0.75,
        name=label,
        marker=dict(color=color),
    )
    for genre, label, color in genre_styles
]

layout = go.Layout(
    barmode='stack',
    title='Electronic Arts Genre Counts According to Platform',
    xaxis=dict(title='Platform'),
    yaxis=dict(title='Count'),
    paper_bgcolor='beige',
    plot_bgcolor='beige',
)
fig = go.Figure(data=trace, layout=layout)
iplot(fig)
Now let's check the yearly global sales of Electronic Arts by genre using a heatmap.
# Sum Electronic Arts' global sales for every (genre, year) pair: genres
# become the rows and years the columns.
pivot = pd.pivot_table(
    df_EA,
    values="Global_Sales",
    index="Genre",
    columns="year",
    aggfunc="sum",
)

# Draw the table as an annotated heatmap; each cell shows the total rounded
# to a whole number.
plt.figure(figsize=(30, 10))
sns.heatmap(pivot, annot=True, fmt=".0f", cmap="Blues")
Now let's check their most renowned game, FIFA, by platform.
# Select every title whose name begins with "FIFA" (str.match anchors the
# pattern at the start of the string) and list the distinct titles found.
is_fifa_title = df['Name'].str.match('FIFA')
df_fifa = df[is_fifa_title]
df_fifa['Name'].unique()

# Total global sales for every (platform, title) pair: platforms become the
# rows and titles the columns.
pivot_fifa = pd.pivot_table(
    df_fifa,
    values="Global_Sales",
    index="Platform",
    columns="Name",
    aggfunc="sum",
)

# Annotated heatmap of the table; each cell shows the total rounded to a
# whole number.
plt.figure(figsize=(35, 12))
sns.heatmap(pivot_fifa, annot=True, fmt=".0f", cmap="Blues")
Let's look at the time-series plot of FIFA global sales.
# Add up the global sales of all FIFA titles per year; as_index=False keeps
# "year" as an ordinary column next to the summed "Global_Sales".
fifa_by_year = df_fifa.groupby(by=['year'], as_index=False)
fifa_global_sales_yearly = fifa_by_year['Global_Sales'].sum()

# Line chart of the yearly totals.
plt.figure(figsize=(20, 5))
plt.plot(
    fifa_global_sales_yearly['year'],
    fifa_global_sales_yearly['Global_Sales'],
)
plt.show()
# Checking Correlation
# Correlation is only defined for numeric data, and df_EA also holds text
# columns (e.g. Name, Platform, Genre, Publisher). Recent pandas versions
# raise on DataFrame.corr() when such columns are present instead of silently
# skipping them, so the numeric columns are selected explicitly.
ea_numeric = df_EA.select_dtypes(include="number")
sns.heatmap(ea_numeric.corr(), cbar=True, annot=True)
# Decision-tree regression: predict Electronic Arts' global sales from the
# four regional sales columns.
features = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']  # our features
X = df_EA[features]
y = df_EA.Global_Sales  # target

# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

DTmodel = DecisionTreeRegressor(random_state=1)
DTmodel.fit(X_train, y_train)
predictions = DTmodel.predict(X_test)

# scikit-learn metric functions take (y_true, y_pred) in that order. MAE and
# MSE are symmetric, so the reported values are unaffected, but the
# documented order is used so the calls stay correct if an asymmetric metric
# is added later.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
print('Mean Absolute Error: ' + str(mae))
print('Mean Squared Error: ' + str(mse))

# Side-by-side view of actual and predicted values for the held-out rows.
pred_EA_df = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
pred_EA_df.head()

# Model score on the held-out rows.
score = DTmodel.score(X_test, y_test)
score
# Random-forest regression: predict Electronic Arts' global sales from the
# four regional sales columns.
features = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']  # our features
X = df_EA[features]
y = df_EA.Global_Sales  # target

# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

RFmodel = RandomForestRegressor(n_estimators=10, random_state=0)
RFmodel.fit(X_train, y_train)
predictions = RFmodel.predict(X_test)

# scikit-learn metric functions take (y_true, y_pred) in that order. MAE and
# MSE are symmetric, so the reported values are unaffected, but the
# documented order is used so the calls stay correct if an asymmetric metric
# is added later.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
print('Mean Absolute Error: ' + str(mae))
print('Mean Squared Error: ' + str(mse))

# Side-by-side view of actual and predicted values for the held-out rows.
pred_EA_df = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
pred_EA_df.head()

# Model score on the held-out rows.
score = RFmodel.score(X_test, y_test)
score
from sklearn.preprocessing import LabelEncoder

# Columns to be label-encoded (text) and columns to be copied over unchanged
# (numeric sales figures).
cat_var = ['Platform', 'Genre', 'Name']
num_var = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

# A single encoder instance, shared by every categorical column.
LE = LabelEncoder()

# Empty frame that will receive the encoded and copied columns; it starts
# with the three categorical column names plus the target.
label_encode = pd.DataFrame(columns=[*cat_var, 'Global_Sales'])

# The columns of label_encode are populated by the encoding() function
# defined right after this setup.
def encoding(cat_var, num_var):
    """Fill the module-level ``label_encode`` frame from ``df_EA`` and return it.

    Args:
        cat_var: Names of the ``df_EA`` columns to label-encode. The shared
            encoder ``LE`` is re-fitted on each column in turn.
        num_var: Names of the ``df_EA`` columns whose raw values are copied
            over unchanged.

    Returns:
        The module-level ``label_encode`` DataFrame, modified in place.
    """
    for name in cat_var:
        label_encode[name] = LE.fit_transform(df_EA[name])
    for name in num_var:
        # .values strips the index so rows are assigned by position.
        label_encode[name] = df_EA[name].values
    return label_encode
# Ridge regression: predict global sales from the label-encoded platform and
# genre plus the four regional sales columns.
ridge_df = encoding(cat_var, num_var)
X = ridge_df[['Platform', 'Genre', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']]
y = ridge_df['Global_Sales']

# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

from sklearn.linear_model import Ridge

ridge = Ridge()
# fit() returns the estimator itself, so ridgeModel and ridge are the same
# fitted object.
ridgeModel = ridge.fit(X_train, y_train)
predictions = ridgeModel.predict(X_test)

# scikit-learn metric functions take (y_true, y_pred) in that order. MAE and
# MSE are symmetric, so the reported values are unaffected, but the
# documented order is used so the calls stay correct if an asymmetric metric
# is added later.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
print('Mean Absolute Error: ' + str(mae))
print('Mean Squared Error: ' + str(mse))

# Side-by-side view of actual and predicted values for the held-out rows.
pred_EA_df = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
pred_EA_df.head()

# Model score on the held-out rows.
score = ridgeModel.score(X_test, y_test)
score